{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BeautifulSoup 的find/find_all基本用法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "html = \"\"\"\n",
    "<table class=\"tablelist\" cellpadding=\"0\" cellspacing=\"0\">\n",
    "        <tbody><tr class=\"h\">\n",
    "            <td class=\"l\" width=\"374\">职位名称</td>\n",
    "            <td>职位类别</td>\n",
    "            <td>人数</td>\n",
    "            <td>地点</td>\n",
    "            <td>发布时间</td>\n",
    "        </tr>\n",
    "                        <tr class=\"even\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\">TME-全民K歌数据产品经理</a></td>\n",
    "            <td>产品/项目类</td>\n",
    "            <td>2</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"odd\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47442&amp;keywords=python&amp;tid=0&amp;lid=0\">TME-全民K歌高级数据产品经理</a></td>\n",
    "            <td>产品/项目类</td>\n",
    "            <td>1</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"even\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>1</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"odd\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47423&amp;keywords=python&amp;tid=0&amp;lid=0\">TEG02-网络运维工程师</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>1</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"even\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>2</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"odd\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47396&amp;keywords=python&amp;tid=0&amp;lid=0\">PCG11-后台开发工程师（北京）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>1</td>\n",
    "            <td>北京</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"even\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>2</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"odd\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47380&amp;keywords=python&amp;tid=0&amp;lid=0\">22989-腾讯云serverless运营开发工程师（成都）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>2</td>\n",
    "            <td>成都</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "                        <tr class=\"even\">\n",
    "            <td class=\"l square\"><a target=\"_blank\" href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\">18436-NLP算法研究员（深圳）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>1</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "        <tr class=\"odd\">\n",
    "            <td class=\"l square\"><a class=\"test\" id=\"test\" target=\"_blank\" href=\"position_detail.php?id=47359&amp;keywords=python&amp;tid=0&amp;lid=0\">PCG17-QQ钱包后台开发工程师（深圳）</a></td>\n",
    "            <td>技术类</td>\n",
    "            <td>1</td>\n",
    "            <td>深圳</td>\n",
    "            <td>2019-02-03</td>\n",
    "        </tr>\n",
    "    </tbody>\n",
    "</table>\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(html, 'lxml')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取所有tr标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"h\">\n",
      "<td class=\"l\" width=\"374\">职位名称</td>\n",
      "<td>职位类别</td>\n",
      "<td>人数</td>\n",
      "<td>地点</td>\n",
      "<td>发布时间</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47442&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌高级数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47423&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TEG02-网络运维工程师</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47396&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">PCG11-后台开发工程师（北京）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>北京</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47380&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（成都）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>成都</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">18436-NLP算法研究员（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a class=\"test\" href=\"position_detail.php?id=47359&amp;keywords=python&amp;tid=0&amp;lid=0\" id=\"test\" target=\"_blank\">PCG17-QQ钱包后台开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "trs = soup.find_all('tr')\n",
    "for tr in trs:\n",
    "    print(tr)\n",
    "#     print(type(tr)) # 类型为 <class 'bs4.element.Tag'>\n",
    "    print('=' * 30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取第2个tr标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n"
     ]
    }
   ],
   "source": [
    "tr = soup.find_all('tr', limit=2)[1]\n",
    "print(tr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取所有class等于even的tr标签"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法1：使用class_参数（class是Python关键字，因此写作class_）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">18436-NLP算法研究员（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "trs = soup.find_all('tr', class_='even')\n",
    "for tr in trs:\n",
    "    print(tr)\n",
    "    print('='*30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法2：使用attrs参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">18436-NLP算法研究员（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "trs = soup.find_all('tr', attrs={'class':'even'})\n",
    "for tr in trs:\n",
    "    print(tr)\n",
    "    print('='*30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 将所有ID等于test,class也等于test的a标签提取出来"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<a class=\"test\" href=\"position_detail.php?id=47359&amp;keywords=python&amp;tid=0&amp;lid=0\" id=\"test\" target=\"_blank\">PCG17-QQ钱包后台开发工程师（深圳）</a>\n"
     ]
    }
   ],
   "source": [
    "aList = soup.find_all('a',id=\"test\",class_=\"test\")\n",
    "for a in aList:\n",
    "    print(a)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法二\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<a class=\"test\" href=\"position_detail.php?id=47359&amp;keywords=python&amp;tid=0&amp;lid=0\" id=\"test\" target=\"_blank\">PCG17-QQ钱包后台开发工程师（深圳）</a>\n"
     ]
    }
   ],
   "source": [
    "aList = soup.find_all('a',attrs={\"id\":\"test\",\"class\":\"test\"})\n",
    "for a in aList:\n",
    "    print(a)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取所有a标签的href属性"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过下标获取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "position_detail.php?id=47441&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47442&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47428&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47423&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47411&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47396&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47379&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47380&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47374&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47359&keywords=python&tid=0&lid=0\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "aList = soup.find_all('a')\n",
    "for a in aList:\n",
    "    # 5.1 通过下标的方式获取\n",
    "    href = a[\"href\"]\n",
    "    print(href)\n",
    "    print('='*30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过attrs属性的方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "position_detail.php?id=47441&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47442&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47428&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47423&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47411&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47396&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47379&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47380&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47374&keywords=python&tid=0&lid=0\n",
      "==============================\n",
      "position_detail.php?id=47359&keywords=python&tid=0&lid=0\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "aList = soup.find_all('a')\n",
    "for a in aList:\n",
    "    # 5.1 通过下标的方式获取\n",
    "    href = a.attrs[\"href\"]\n",
    "    print(href)\n",
    "    print('='*30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取所有的职位信息并且是纯文本"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法一：string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'p_name': 'TME-全民K歌数据产品经理', 'city': '深圳', 'num': '2', 'p_type': '产品/项目类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': 'TME-全民K歌高级数据产品经理', 'city': '深圳', 'num': '1', 'p_type': '产品/项目类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': '30628-腾讯广告算法高级工程师（研发中心-深圳）', 'city': '深圳', 'num': '1', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': 'TEG02-网络运维工程师', 'city': '深圳', 'num': '1', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': '22989-腾讯云资深运营开发工程师（深圳）', 'city': '深圳', 'num': '2', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': 'PCG11-后台开发工程师（北京）', 'city': '北京', 'num': '1', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': '22989-腾讯云serverless运营开发工程师（深圳总部）', 'city': '深圳', 'num': '2', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': '22989-腾讯云serverless运营开发工程师（成都）', 'city': '成都', 'num': '2', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': '18436-NLP算法研究员（深圳）', 'city': '深圳', 'num': '1', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n",
      "{'p_name': 'PCG17-QQ钱包后台开发工程师（深圳）', 'city': '深圳', 'num': '1', 'p_type': '技术类', 'pub_time': '2019-02-03'}\n"
     ]
    }
   ],
   "source": [
    "position_list = []\n",
    "position_dict = {}\n",
    "trs = soup.find_all('tr')[1:]\n",
    "for tr in trs:\n",
    "    tds = tr.find_all('td')\n",
    "    position_dict[\"p_name\"] = tds[0].string\n",
    "    position_dict[\"p_type\"] = tds[1].string\n",
    "    position_dict[\"num\"] = tds[2].string\n",
    "    position_dict[\"city\"] = tds[3].string\n",
    "    position_dict[\"pub_time\"] = tds[4].string\n",
    "    position_list.append(position_dict)\n",
    "    position_dict = {}\n",
    "    \n",
    "for p in position_list:\n",
    "    print(p)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法二：strings（tr.strings返回一个生成器，其中包含换行符等空白字符串）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['\\n', 'TME-全民K歌数据产品经理', '\\n', '产品/项目类', '\\n', '2', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', 'TME-全民K歌高级数据产品经理', '\\n', '产品/项目类', '\\n', '1', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', '30628-腾讯广告算法高级工程师（研发中心-深圳）', '\\n', '技术类', '\\n', '1', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', 'TEG02-网络运维工程师', '\\n', '技术类', '\\n', '1', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', '22989-腾讯云资深运营开发工程师（深圳）', '\\n', '技术类', '\\n', '2', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', 'PCG11-后台开发工程师（北京）', '\\n', '技术类', '\\n', '1', '\\n', '北京', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', '22989-腾讯云serverless运营开发工程师（深圳总部）', '\\n', '技术类', '\\n', '2', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', '22989-腾讯云serverless运营开发工程师（成都）', '\\n', '技术类', '\\n', '2', '\\n', '成都', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', '18436-NLP算法研究员（深圳）', '\\n', '技术类', '\\n', '1', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n",
      "['\\n', 'PCG17-QQ钱包后台开发工程师（深圳）', '\\n', '技术类', '\\n', '1', '\\n', '深圳', '\\n', '2019-02-03', '\\n']\n"
     ]
    }
   ],
   "source": [
    "position_list = []\n",
    "position_dict = {}\n",
    "trs = soup.find_all('tr')[1:]\n",
    "for tr in trs:\n",
    "    infos = list(tr.strings)\n",
    "    print(infos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法三：stripped_strings\n",
    "注：这种方法会去掉每个字符串首尾的空白，并跳过只含空白的字符串。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'p_name': 'TME-全民K歌数据产品经理', 'post_time': '2019-02-03', 'num': '2', 'p_type': '产品/项目类', 'area': '深圳'}\n",
      "{'p_name': 'TME-全民K歌高级数据产品经理', 'post_time': '2019-02-03', 'num': '1', 'p_type': '产品/项目类', 'area': '深圳'}\n",
      "{'p_name': '30628-腾讯广告算法高级工程师（研发中心-深圳）', 'post_time': '2019-02-03', 'num': '1', 'p_type': '技术类', 'area': '深圳'}\n",
      "{'p_name': 'TEG02-网络运维工程师', 'post_time': '2019-02-03', 'num': '1', 'p_type': '技术类', 'area': '深圳'}\n",
      "{'p_name': '22989-腾讯云资深运营开发工程师（深圳）', 'post_time': '2019-02-03', 'num': '2', 'p_type': '技术类', 'area': '深圳'}\n",
      "{'p_name': 'PCG11-后台开发工程师（北京）', 'post_time': '2019-02-03', 'num': '1', 'p_type': '技术类', 'area': '北京'}\n",
      "{'p_name': '22989-腾讯云serverless运营开发工程师（深圳总部）', 'post_time': '2019-02-03', 'num': '2', 'p_type': '技术类', 'area': '深圳'}\n",
      "{'p_name': '22989-腾讯云serverless运营开发工程师（成都）', 'post_time': '2019-02-03', 'num': '2', 'p_type': '技术类', 'area': '成都'}\n",
      "{'p_name': '18436-NLP算法研究员（深圳）', 'post_time': '2019-02-03', 'num': '1', 'p_type': '技术类', 'area': '深圳'}\n",
      "{'p_name': 'PCG17-QQ钱包后台开发工程师（深圳）', 'post_time': '2019-02-03', 'num': '1', 'p_type': '技术类', 'area': '深圳'}\n"
     ]
    }
   ],
   "source": [
    "position_list = []\n",
    "position_dict = {}\n",
    "trs = soup.find_all('tr')[1:]\n",
    "for tr in trs:\n",
    "        infos = list(tr.stripped_strings)\n",
    "        # print(infos)\n",
    "        position_dict[\"p_name\"] = infos[0]\n",
    "        position_dict['p_type'] = infos[1]\n",
    "        position_dict['num'] = infos[2]\n",
    "        position_dict['area'] = infos[3]\n",
    "        position_dict['post_time'] = infos[4]\n",
    "        position_list.append(position_dict)\n",
    "        position_dict = {}\n",
    "        \n",
    "for p in position_list:\n",
    "    print(p)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# BeautifulSoup css选择器select的用法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取所有tr标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"h\">\n",
      "<td class=\"l\" width=\"374\">职位名称</td>\n",
      "<td>职位类别</td>\n",
      "<td>人数</td>\n",
      "<td>地点</td>\n",
      "<td>发布时间</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47442&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌高级数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47423&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TEG02-网络运维工程师</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47396&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">PCG11-后台开发工程师（北京）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>北京</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47380&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（成都）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>成都</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">18436-NLP算法研究员（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n",
      "<tr class=\"odd\">\n",
      "<td class=\"l square\"><a class=\"test\" href=\"position_detail.php?id=47359&amp;keywords=python&amp;tid=0&amp;lid=0\" id=\"test\" target=\"_blank\">PCG17-QQ钱包后台开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "========================================\n"
     ]
    }
   ],
   "source": [
    "soup = BeautifulSoup(html, 'lxml')\n",
    "trs = soup.select('tr')\n",
    "for tr in trs:\n",
    "    print(tr)\n",
    "    print('='*40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取第2个tr标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "2.获取第2个tr标签\n",
    ":return:\n",
    "\"\"\"\n",
    "soup = BeautifulSoup(html, 'lxml')\n",
    "tr = soup.select('tr')[1]\n",
    "print(tr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47441&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">TME-全民K歌数据产品经理</a></td>\n",
      "<td>产品/项目类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "============================================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47428&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">30628-腾讯广告算法高级工程师（研发中心-深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "============================================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47411&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云资深运营开发工程师（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "============================================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47379&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">22989-腾讯云serverless运营开发工程师（深圳总部）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>2</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "============================================================\n",
      "<tr class=\"even\">\n",
      "<td class=\"l square\"><a href=\"position_detail.php?id=47374&amp;keywords=python&amp;tid=0&amp;lid=0\" target=\"_blank\">18436-NLP算法研究员（深圳）</a></td>\n",
      "<td>技术类</td>\n",
      "<td>1</td>\n",
      "<td>深圳</td>\n",
      "<td>2019-02-03</td>\n",
      "</tr>\n",
      "============================================================\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "3.获取所有class等于even的tr标签\n",
    ":return:\n",
    "\"\"\"\n",
    "soup = BeautifulSoup(html, 'lxml')\n",
    "# trs = soup.select('tr.even')\n",
    "# trs = soup.select('.even')  # 因为class=\"even\" 的只有tr标签\n",
    "trs = soup.select(\"tr[class='even']\")\n",
    "for tr in trs:\n",
    "    print(tr)\n",
    "    print('==' * 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "273px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
